{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 处理数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zwb/.cache/kgenv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2024-06-23 17:30:06.516442: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2024-06-23 17:30:06.568119: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-06-23 17:30:07.666180: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset\n",
    "import pandas as pd\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig\n",
    "tokenizer = AutoTokenizer.from_pretrained('qwen/Qwen2-7B-Instruct')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'input': '我想送赵老师生日祝福,严肃风格',\n",
       "  'output': '尊敬的赵老师，值此生辰之际，愿岁月如诗，为您带来无尽的喜悦与美好；愿时光荏苒，为您留下珍贵的回忆与感悟。愿您快快乐乐，事业更上一层楼，教诲之恩桃李满天下。在这充满敬意的时刻，恭祝赵老师生日快乐，幸福安康！',\n",
       "  'system': '你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福'}]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_json('./dataset/tianji-wishes-chinese-v0.1.json')\n",
    "ds = Dataset.from_pandas(df)\n",
    "ds['conversation'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'system'],\n",
       "    num_rows: 2976\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 转换为 DataFrame 然后转换为 Dataset\n",
    "df = pd.DataFrame(ds)\n",
    "df = df.explode('conversation')  # 展平 'conversation' 列\n",
    "df = pd.json_normalize(df['conversation'])  # 将嵌套的 JSON 展平\n",
    "dataset = Dataset.from_pandas(df)\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input': '我想送赵老师生日祝福,严肃风格',\n",
       " 'output': '尊敬的赵老师，值此生辰之际，愿岁月如诗，为您带来无尽的喜悦与美好；愿时光荏苒，为您留下珍贵的回忆与感悟。愿您快快乐乐，事业更上一层楼，教诲之恩桃李满天下。在这充满敬意的时刻，恭祝赵老师生日快乐，幸福安康！',\n",
       " 'system': '你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_func(example):\n",
    "    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性\n",
    "    input_ids, attention_mask, labels = [], [], []\n",
    "    instruction = tokenizer(f\"<|im_start|>system\\n你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福<|im_end|>\\n<|im_start|>user\\n{example['input']}<|im_end|>\\n<|im_start|>assistant\\n\", add_special_tokens=False)  \n",
    "    response = tokenizer(f\"{example['output']}\", add_special_tokens=False)\n",
    "    input_ids = instruction[\"input_ids\"] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n",
    "    attention_mask = instruction[\"attention_mask\"] + response[\"attention_mask\"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1\n",
    "    labels = [-100] * len(instruction[\"input_ids\"]) + response[\"input_ids\"] + [tokenizer.pad_token_id]  \n",
    "    if len(input_ids) > MAX_LENGTH:  \n",
    "        input_ids = input_ids[:MAX_LENGTH]\n",
    "        attention_mask = attention_mask[:MAX_LENGTH]\n",
    "        labels = labels[:MAX_LENGTH]\n",
    "    return {\n",
    "        \"input_ids\": input_ids,\n",
    "        \"attention_mask\": attention_mask,\n",
    "        \"labels\": labels\n",
    "    }"
   ]
  },
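  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`process_func` builds the Qwen2 chat format by hand. As an optional sanity check (a minimal sketch, assuming the tokenizer ships Qwen2's default chat template), the prompt it constructs can be compared with what `tokenizer.apply_chat_template` produces for the same system/user messages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: the manually built prompt in process_func should match\n",
    "# the tokenizer's own chat template output for the same system/user messages.\n",
    "# (Assumption: the tokenizer uses Qwen2's default chat template.)\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福\"},\n",
    "    {\"role\": \"user\", \"content\": dataset[0]['input']}\n",
    "]\n",
    "templated = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
    "print(templated)"
   ]
  },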
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 2976/2976 [00:01<00:00, 2706.16 examples/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input_ids', 'attention_mask', 'labels'],\n",
       "    num_rows: 2976\n",
       "})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_id = dataset.map(process_func, remove_columns=dataset.column_names)\n",
    "tokenized_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<|im_start|>system\\n你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福<|im_end|>\\n<|im_start|>user\\n我想送赵老师生日祝福,严肃风格<|im_end|>\\n<|im_start|>assistant\\n尊敬的赵老师，值此生辰之际，愿岁月如诗，为您带来无尽的喜悦与美好；愿时光荏苒，为您留下珍贵的回忆与感悟。愿您快快乐乐，事业更上一层楼，教诲之恩桃李满天下。在这充满敬意的时刻，恭祝赵老师生日快乐，幸福安康！<|endoftext|>'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(tokenized_id[0]['input_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'赵老师，值此春节佳节之际，恭祝您福寿安康，万事如意。在过去的一年里，您的辛勤耕耘为后辈树立了榜样，新春到来，愿您的生活如诗如画，工作更上一层楼，继续以您的智慧和热忱，引领我们前行。岁月静好，愿您享受每一个温馨时刻，幸福安康，喜悦无忧。<|endoftext|>'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1][\"labels\"])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 创建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.88s/it]\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# 使用 `bitsandbytes` 进行量化\n",
    "model_name = 'qwen/Qwen2-7B-Instruct'\n",
    "\n",
    "bnb_config = {\n",
    "    'load_in_4bit': True,  # 启用 4-bit 量化\n",
    "    'bnb_4bit_compute_dtype': torch.bfloat16,  # 计算使用 bfloat16\n",
    "    'bnb_4bit_use_double_quant': True,  # 使用双重量化\n",
    "    'bnb_4bit_quant_type': 'nf4',  # 量化类型，nf4\n",
    "}\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config,device_map=\"auto\",torch_dtype=torch.bfloat16)"
   ]
  },
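  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick check that the weights really loaded in 4-bit, the sketch below prints the model's memory footprint via `get_memory_footprint()`; a 7B model quantized to 4-bit should occupy only a few GB, versus roughly 15 GB for plain bf16 weights."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough check of the quantized load: a 4-bit 7B model should take a few GB,\n",
    "# versus roughly 15 GB for plain bf16 weights.\n",
    "print(f\"Memory footprint: {model.get_memory_footprint() / 1024**3:.2f} GB\")"
   ]
  },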
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.enable_input_require_grads()"
   ]
  },
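  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`enable_input_require_grads()` is needed so that gradient checkpointing works with a frozen quantized base. An alternative, commonly used setup is `peft`'s `prepare_model_for_kbit_training`, sketched below; roughly, it freezes the base weights, upcasts some layers for numerical stability, and enables input gradients. It is shown commented out because it replaces the cell above rather than complementing it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative to enable_input_require_grads(): peft's helper for k-bit training.\n",
    "# It freezes the base weights, upcasts some layers for stability, and enables\n",
    "# input gradients so gradient checkpointing works.\n",
    "from peft import prepare_model_for_kbit_training\n",
    "\n",
    "# model = prepare_model_for_kbit_training(model)  # uncomment to use instead of the cell above"
   ]
  },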
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.bfloat16"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.dtype"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Qlora "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from peft import LoraConfig, TaskType, get_peft_model\n",
    "\n",
    "# QLoRA 配置\n",
    "config = LoraConfig(\n",
    "    task_type=TaskType.CAUSAL_LM,\n",
    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
    "    inference_mode=False,\n",
    "    r=8,\n",
    "    lora_alpha=32,\n",
    "    lora_dropout=0.1\n",
    ")\n",
    "\n",
    "# 应用 QLoRA 配置\n",
    "model = get_peft_model(model, config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643\n"
     ]
    }
   ],
   "source": [
    "model.print_trainable_parameters()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 配置训练参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    output_dir=\"./output/Qwen2_instruct_Qlora\",\n",
    "    per_device_train_batch_size=4,\n",
    "    gradient_accumulation_steps=4,\n",
    "    logging_steps=10,\n",
    "    num_train_epochs=3,\n",
    "    save_steps=100, \n",
    "    learning_rate=1e-4,\n",
    "    save_on_each_node=True,\n",
    "    gradient_checkpointing=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    train_dataset=tokenized_id,\n",
    "    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
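  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`Trainer` already writes checkpoints every `save_steps`. If you also want the final adapter in a fixed location, a minimal sketch is below; the directory name `final_adapter` is just an example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: save the final LoRA adapter and tokenizer to a fixed directory.\n",
    "# The path './output/Qwen2_instruct_Qlora/final_adapter' is only an example.\n",
    "trainer.model.save_pretrained('./output/Qwen2_instruct_Qlora/final_adapter')\n",
    "tokenizer.save_pretrained('./output/Qwen2_instruct_Qlora/final_adapter')"
   ]
  },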
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 合并加载模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.96s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "模型和 QLoRA 权重加载成功！\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from peft import PeftModel\n",
    "import torch\n",
    "import bitsandbytes as bnb\n",
    "\n",
    "# 设置路径\n",
    "lora_path = './output/Qwen2_instruct_Qlora/checkpoint-500'  # 你的 QLoRA 权重路径\n",
    "model_name = 'qwen/Qwen2-7B-Instruct'  # 基础模型\n",
    "\n",
    "# 加载分词器\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "\n",
    "# 配置量化\n",
    "bnb_config = {\n",
    "    'load_in_4bit': True,  # 启用 4-bit 量化\n",
    "    'bnb_4bit_compute_dtype': torch.bfloat16,  # 计算使用 bfloat16\n",
    "    'bnb_4bit_use_double_quant': True,  # 使用双重量化\n",
    "    'bnb_4bit_quant_type': 'nf4',  # 量化类型，nf4\n",
    "}\n",
    "\n",
    "# 加载基础模型并应用量化\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_name,\n",
    "    device_map=\"auto\",  # 自动分配设备\n",
    "    torch_dtype=torch.bfloat16,  # 使用 bfloat16\n",
    "    trust_remote_code=True,  # 信任远程代码\n",
    "    quantization_config=bnb_config  # 应用量化配置\n",
    ").eval()\n",
    "\n",
    "# 加载 QLoRA 权重\n",
    "model = PeftModel.from_pretrained(model, lora_path)\n",
    "\n",
    "# 模型设置为推理模式\n",
    "model.eval()\n",
    "\n"
   ]
  },
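  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell above only attaches the adapter to the quantized base model. If a standalone merged checkpoint is needed (for example, for deployment without `peft`), the sketch below reloads the base model in bf16, merges the LoRA weights into it, and saves the result. It assumes there is enough memory for the full-precision model; the output path `./output/merged_model` is only an example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: merge the LoRA adapter into a full-precision copy of the base model.\n",
    "# Assumes enough memory for the bf16 weights; './output/merged_model' is an example path.\n",
    "base = AutoModelForCausalLM.from_pretrained(\n",
    "    'qwen/Qwen2-7B-Instruct', torch_dtype=torch.bfloat16, device_map='auto'\n",
    ")\n",
    "merged = PeftModel.from_pretrained(base, lora_path).merge_and_unload()\n",
    "merged.save_pretrained('./output/merged_model')\n",
    "tokenizer.save_pretrained('./output/merged_model')"
   ]
  },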
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型推理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "愿端午佳节，艾叶飘香，龙舟竞渡，为你带来吉祥与安康。愿你在学习之余，享受传统节日的韵味，身心愉悦，才思泉涌，前程似锦。祝郑同学端午节快乐，万事如意！\n"
     ]
    }
   ],
   "source": [
    "prompt = \"我想祝福郑同学端午节快乐，严肃风格\"\n",
    "inputs = tokenizer.apply_chat_template([{\"role\": \"user\", \"content\": \"你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福。\"},{\"role\": \"user\", \"content\": prompt}],\n",
    "                                       add_generation_prompt=True,\n",
    "                                       tokenize=True,\n",
    "                                       return_tensors=\"pt\",\n",
    "                                       return_dict=True\n",
    "                                       ).to('cuda')\n",
    "\n",
    "\n",
    "gen_kwargs = {\"max_length\": 2500, \"do_sample\": True, \"top_k\": 1}\n",
    "with torch.no_grad():\n",
    "    outputs = model.generate(**inputs, **gen_kwargs)\n",
    "    outputs = outputs[:, inputs['input_ids'].shape[1]:]\n",
    "    print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kgenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
